import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor  # thread pool for async dispatch of detail-page jobs
from parsel import Selector
from pymysqlpool import ConnectionPool
import time
# DBUtils
import pymysql
from DBUtils.PersistentDB import PersistentDB
from DBUtils.PooledDB import PooledDB

import re
# ProcessPoolExecutor: process-pool alternative (for CPU-bound work)

def get_word_list(url):
    """Scrape one article-listing page.

    Fetches the listing page at ``url``, submits every article detail URL
    found on it to the module-level thread pool (``executor``, created under
    ``__main__``) for processing by ``get_word_info``, and returns the URL
    of the next listing page.

    :param url: URL of a listing page.
    :return: next page's URL, or ``False`` when there is no next page.
    """
    res = requests.get(url).content
    sel = etree.HTML(res)
    # Article detail links on this listing page.
    # (Previously this xpath was evaluated twice — once into an unused
    # local and once again inside executor.map.)
    urls = sel.xpath('//a[@class="archive-title"]/@href')
    # Fan the detail URLs out to the thread pool; map() submits each URL
    # to get_word_info without blocking on the results.
    executor.map(get_word_info, urls)
    # Link to the next listing page, if any.
    next_href = sel.xpath('//a[@class="next page-numbers"]/@href')
    if next_href:
        return next_href[0]
    return False

# Article detail page scraper
def get_word_info(url):
    """Scrape one article detail page and insert it into the ``jobble`` table.

    Runs inside the thread pool; obtains a per-thread DB connection from the
    module-level PersistentDB pool (``pool1``). MySQL warnings raised by
    pymysql are deliberately swallowed (best-effort insert).

    :param url: URL of the article detail page.
    """
    print(url)
    try:
        # One connection per worker thread, courtesy of PersistentDB.
        conn = pool1.connection()
        cursor = conn.cursor()
        try:
            res = requests.get(url, headers=headers).content
            sel = etree.HTML(res)
            # Author — two alternative page layouts; may be absent.
            author = sel.xpath('//*[@id="author-bio"]/h3/a/text()|//div[@class="copyright-area"]/a[1]/text()')
            author = author[0] if author else ''
            # Title
            title = sel.xpath('//div[@class="entry-header"]/h1/text()')[0].strip()
            # Body text: paragraphs, list items, and sub-headings.
            contents_list = sel.xpath('//div[@class="entry"]/p/text()|//div[@class="entry"]/ul/li/text()|//div[@class="entry"]/h2/text()|//div[@class="entry"]/ol/li/text()')
            contents = '\n'.join(contents_list)
            # Category
            category = sel.xpath('//p[@class="entry-meta-hide-on-mobile"]/a[@rel="category tag"]/text()')[0]
            # Tags: keep only the <a> elements whose href contains /tag/.
            # Compile the pattern once instead of per iteration.
            tag_pattern = re.compile('/tag/')
            label_list = []
            for item in sel.xpath(r'//p[@class="entry-meta-hide-on-mobile"]/a'):
                if tag_pattern.search(item.xpath('@href')[0]):
                    label_list.append(item.xpath('text()')[0])
            labels = ','.join(label_list)
            # Publish date (strip the " ·" separator around it).
            fbrq = sel.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()')[0].strip(' ·').strip()
            sql = 'insert into jobble values(NULL,%s,%s,%s,%s,%s,%s,%s)'
            cursor.execute(sql, (title, contents, author, category, labels, fbrq, url))
            # BUG FIX: pymysql does not autocommit by default, so without
            # an explicit commit the INSERT was silently discarded.
            conn.commit()
        finally:
            # BUG FIX: always release the cursor/connection, even when the
            # HTTP request or an xpath lookup raises (was leaked before).
            cursor.close()
            conn.close()
    except pymysql.err.Warning:
        pass

if __name__ == '__main__':
    url = 'http://blog.jobbole.com/all-posts/'
    # Shared thread pool used by get_word_list to fan out detail pages.
    executor = ThreadPoolExecutor(max_workers=3)
    headers = {
        'Host': 'blog.jobbole.com',
        'Referer': 'http://blog.jobbole.com/all-posts/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.1.1'
    }
    # One persistent DB connection per worker thread.
    pool1 = PersistentDB(creator=pymysql, maxusage=None, ping=0, closeable=False, host='127.0.0.1', user='root',
                         password='root', db='blog_spider', charset='utf8')
    # Walk the listing pages until get_word_list returns False.
    while url:
        url = get_word_list(url)
        # BUG FIX: was time.sleep(1000) — a ~17-minute pause between pages
        # (sleep takes seconds); also ran before the loop-exit check, so
        # the final iteration slept pointlessly. Throttle 1s between pages.
        if url:
            time.sleep(1)
    # Wait for all in-flight detail-page jobs to finish before exiting.
    executor.shutdown(wait=True)